{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/mnli/multinli_1.0_dev_matched.json.translated\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/mnli/multinli_1.0_dev_mismatched.json.translated\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/mnli/multinli_1.0_train.json.translated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/snli/snli_1.0_train.json.translate\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/snli/snli_1.0_test.json.translate\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/snli/snli_1.0_dev.json.translate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import tensorflow as tf\n",
    "import json\n",
    "from pathlib import Path\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"Collapse every run of whitespace in `string` into a single space.\n",
    "\n",
    "    The cleaned text is written as one field of a tab-separated file with\n",
    "    one record per line, so tabs and carriage returns have to be flattened\n",
    "    together with newlines and repeated spaces; a stray tab would otherwise\n",
    "    add an extra column to the record.\n",
    "\n",
    "    Args:\n",
    "        string: text to normalise.\n",
    "\n",
    "    Returns:\n",
    "        The text with leading and trailing whitespace removed and every\n",
    "        internal whitespace run replaced by exactly one space.\n",
    "    \"\"\"\n",
    "    return re.sub(r'\\s+', ' ', string).strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['multinli_1.0_train.json.translated',\n",
       " 'multinli_1.0_dev_mismatched.json.translated',\n",
       " 'multinli_1.0_dev_matched.json.translated']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the translated MultiNLI files found in the working directory.\n",
    "# NOTE(review): glob() returns matches in arbitrary order, so the positional\n",
    "# index applied to `files` in the next cell depends on this listing - confirm.\n",
    "files = glob('multinli_1.0*')\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The handle is left open on purpose: the next cell reads one line from it with next(f).\n",
    "f = open(files[-2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'gold_label': 'contradiction',\n",
       " 'promptID': '133794',\n",
       " 'pairID': '133794c',\n",
       " 'genre': 'verbatim',\n",
       " 'en': [\"The answer has nothing to do with their cause, however, but with the simple fact that dictionaries are not exercises in bi-unique substitutability; in other words, if one of the senses of run is `operate' (as in She runs an engine factory ), that does not make it valid to assume that one can substitute operate for run in We run in the marathon every year .  Although recognizing this as a shortcoming of dictionaries and assigning it arbitrarily to what, for lack of a better term, we might call the  genius  of the language, might seem trivial to the casual observer, it is a valid matter for concern in the realm of lexicology.\",\n",
       "  'Dictionaries are indeed exercises in bi-unique substitutability.'],\n",
       " 'ms': [\"Jawapannya tidak ada kaitan dengan tujuan mereka, tetapi dengan fakta sederhana bahawa kamus bukan latihan dalam penggantian dua unik; dengan kata lain, jika salah satu deria lari adalah `beroperasi' (seperti dalam She menjalankan kilang mesin), itu tidak menjadikannya sah untuk menganggap bahawa seseorang boleh menggantikan operasi untuk dijalankan di We berlari di maraton setiap tahun. Walaupun menyedari ini sebagai kekurangan kamus dan memberikannya secara sewenang-wenangnya kepada apa, kerana kekurangan istilah yang lebih baik, kita mungkin memanggil genius bahasa, mungkin kelihatan remeh bagi pemerhati kasual, itu adalah perkara yang sah untuk dikhawatirkan dalam bidang leksikologi.\",\n",
       "  'Kamus memang latihan dalam substitusi bi unik.']}"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decode the next line of the open file as JSON and display it to inspect the record layout.\n",
    "l = json.loads(next(f))\n",
    "l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "snli_1.0_test.json.translate\n",
      "snli_1.0_test.json.translate 10000\n",
      "snli_1.0_train.json.translate\n",
      "snli_1.0_train.json.translate 550152\n",
      "snli_1.0_dev.json.translate\n",
      "snli_1.0_dev.json.translate 10000\n"
     ]
    }
   ],
   "source": [
    "# Malay renderings of the NLI labels; a label that is not listed is written unchanged.\n",
    "labels = {'contradiction': 'percanggahan', 'entailment': 'berkait'}\n",
    "\n",
    "filename = 'snli.tsv'\n",
    "files = glob('snli_1.0*')\n",
    "# Flatten every translated SNLI file into one TSV: sentence pair, tab, label.\n",
    "with tf.io.gfile.GFile(filename, 'w') as outfile:\n",
    "    for file in files:\n",
    "        print(file)\n",
    "        # Each SNLI file is loaded as a single JSON document.\n",
    "        with open(file) as fopen:\n",
    "            data = json.load(fopen)\n",
    "        print(file, len(data))\n",
    "\n",
    "        for row in data:\n",
    "            # Skip rows that do not hold exactly two translated sentences.\n",
    "            if len(row['ms']) != 2:\n",
    "                continue\n",
    "\n",
    "            # NOTE(review): any gold_label outside `labels` (for example a\n",
    "            # no-consensus marker) is emitted as-is - confirm that is intended.\n",
    "            label = labels.get(row['gold_label'], row['gold_label'])\n",
    "            q = f\"ayat1: {cleaning(row['ms'][0])} ayat2: {cleaning(row['ms'][1])}\"\n",
    "            outfile.write('%s\\t%s\\n' % (q, label))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "multinli_1.0_train.json.translated\n",
      "multinli_1.0_dev_mismatched.json.translated\n",
      "multinli_1.0_dev_matched.json.translated\n"
     ]
    }
   ],
   "source": [
    "files = glob('multinli_1.0*')\n",
    "\n",
    "filename = 'mnli.tsv'\n",
    "# Flatten every translated MultiNLI file into one TSV: sentence pair, tab, label.\n",
    "with tf.io.gfile.GFile(filename, 'w') as outfile:\n",
    "    for file in files:\n",
    "        print(file)\n",
    "        with open(file) as fopen:\n",
    "            # Each line of the file is decoded as its own JSON object.\n",
    "            for row in map(json.loads, fopen):\n",
    "                pair = row['ms']\n",
    "                # Only rows with exactly two translated sentences are kept.\n",
    "                if len(pair) == 2:\n",
    "                    gold = row['gold_label']\n",
    "                    # Labels missing from `labels` are written as they are.\n",
    "                    label = labels.get(gold, gold)\n",
    "                    first, second = cleaning(pair[0]), cleaning(pair[1])\n",
    "                    q = f\"ayat1: {first} ayat2: {second}\"\n",
    "                    outfile.write('%s\\t%s\\n' % (q, label))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "vocab = 'sp10m.cased.ms-en.model'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataset(split, shuffle_files = False):\n",
    "    \"\"\"Build a tf.data pipeline over the exported MNLI and SNLI TSV files.\n",
    "\n",
    "    Every line of 'mnli.tsv' and 'snli.tsv' is split on tabs into two string\n",
    "    fields, which are exposed as the 'question' and 'answer' entries of a dict.\n",
    "\n",
    "    Args:\n",
    "        split: accepted for the dataset_fn signature; not used here.\n",
    "        shuffle_files: accepted for the dataset_fn signature; discarded.\n",
    "\n",
    "    Returns:\n",
    "        A dataset of {'question': ..., 'answer': ...} examples.\n",
    "    \"\"\"\n",
    "    del shuffle_files\n",
    "    columns = ['question', 'answer']\n",
    "    # Tab-delimited parsing with quote handling off, two string columns per line.\n",
    "    parse_line = functools.partial(\n",
    "        tf.io.decode_csv,\n",
    "        record_defaults = ['', ''],\n",
    "        field_delim = '\\t',\n",
    "        use_quote_delim = False,\n",
    "    )\n",
    "\n",
    "    def to_example(*fields):\n",
    "        return dict(zip(columns, fields))\n",
    "\n",
    "    lines = tf.data.TextLineDataset(['mnli.tsv', 'snli.tsv'])\n",
    "    parsed = lines.map(\n",
    "        parse_line,\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    return parsed.map(to_example)\n",
    "\n",
    "def preprocessor(ds):\n",
    "    \"\"\"Rename each example's fields to the 'inputs' / 'targets' pair.\n",
    "\n",
    "    Args:\n",
    "        ds: dataset whose examples have 'question' and 'answer' entries.\n",
    "\n",
    "    Returns:\n",
    "        A dataset whose examples map 'inputs' to the question and 'targets'\n",
    "        to the answer.\n",
    "    \"\"\"\n",
    "    return ds.map(\n",
    "        lambda ex: {'inputs': ex['question'], 'targets': ex['answer']},\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['mnli.tsv', 'snli.tsv']\n",
    "\n",
    "task_name = 'similarity_dataset'\n",
    "# Options handed to the registry: data source, the only split, the\n",
    "# preprocessing step, the SentencePiece vocabulary and the evaluation metric.\n",
    "task_options = dict(\n",
    "    dataset_fn = dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [preprocessor],\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")\n",
    "\n",
    "# Removing the name first presumably lets this cell be re-run in the same\n",
    "# kernel without a duplicate registration - confirm against the t5 registry.\n",
    "t5.data.TaskRegistry.remove(task_name)\n",
    "t5.data.TaskRegistry.add(task_name, **task_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"similarity_dataset\")\n",
    "# The task is registered with a single 'train' split, so request it by name\n",
    "# rather than through a loop variable left over from an earlier cell.\n",
    "ds = nq_task.get_dataset(split=\"train\", sequence_length={\"inputs\": 1024, \"targets\": 1024})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the dataset with tfds.as_numpy so the next cell can pull an example from it.\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# iter() keeps this working whether `r` is already an iterator or only an\n",
    "# iterable; next() on its own requires an iterator.\n",
    "next(iter(r))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
